[Log] Add debug logging of the GPU worker's initial memory snapshot and requested memory (#29493)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
@@ -14,6 +14,10 @@ import torch.types
|
|||||||
from .mem_constants import GiB_bytes
|
from .mem_constants import GiB_bytes
|
||||||
|
|
||||||
|
|
||||||
|
def format_gib(b: int) -> float:
    """Express a byte count in GiB, rounded to two decimal places.

    Args:
        b: Size in bytes.

    Returns:
        ``b`` divided by ``GiB_bytes`` and rounded to 2 decimals. The result
        is a float, so callers that want a fixed number of digits in a
        message still need to apply their own format specifier.
    """
    size_in_gib = b / GiB_bytes
    return round(size_in_gib, 2)
|
||||||
|
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
|
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
|
||||||
"""Returns the maximum shared memory per thread block in bytes."""
|
"""Returns the maximum shared memory per thread block in bytes."""
|
||||||
@@ -146,6 +150,18 @@ class MemorySnapshot:
|
|||||||
auto_measure=False,
|
auto_measure=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
    """Render the snapshot as a single comma-separated line.

    The six byte-valued fields are converted with ``format_gib`` and
    suffixed with ``GiB``; ``timestamp`` and ``auto_measure`` are
    appended unchanged.
    """
    # Byte-valued fields, listed in the order they appear in the output.
    byte_fields = (
        ("torch_peak", self.torch_peak),
        ("free_memory", self.free_memory),
        ("total_memory", self.total_memory),
        ("cuda_memory", self.cuda_memory),
        ("torch_memory", self.torch_memory),
        ("non_torch_memory", self.non_torch_memory),
    )
    pieces = [
        f"{label}={format_gib(num_bytes)}GiB" for label, num_bytes in byte_fields
    ]
    pieces.append(f"timestamp={self.timestamp}")
    pieces.append(f"auto_measure={self.auto_measure}")
    return ", ".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MemoryProfilingResult:
|
class MemoryProfilingResult:
|
||||||
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
|
|||||||
return (
|
return (
|
||||||
f"Memory profiling takes {self.profile_time:.2f} seconds. "
|
f"Memory profiling takes {self.profile_time:.2f} seconds. "
|
||||||
f"Total non KV cache memory: "
|
f"Total non KV cache memory: "
|
||||||
f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
|
f"{format_gib(self.non_kv_cache_memory)}GiB; "
|
||||||
f"torch peak memory increase: "
|
f"torch peak memory increase: "
|
||||||
f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
|
f"{format_gib(self.torch_peak_increase)}GiB; "
|
||||||
f"non-torch forward increase memory: "
|
f"non-torch forward increase memory: "
|
||||||
f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
|
f"{format_gib(self.non_torch_increase)}GiB; "
|
||||||
f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
|
f"weights memory: {format_gib(self.weights_memory)}GiB."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
|
|||||||
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
|
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils.mem_constants import GiB_bytes
|
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
|
||||||
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
|
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
from vllm.utils.torch_utils import set_random_seed
|
||||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||||
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
|
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
|
||||||
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
|
|||||||
used_bytes = total - free_bytes_after_sleep
|
used_bytes = total - free_bytes_after_sleep
|
||||||
assert freed_bytes >= 0, "Memory usage increased after sleeping."
|
assert freed_bytes >= 0, "Memory usage increased after sleeping."
|
||||||
logger.info(
|
logger.info(
|
||||||
"Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
|
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
|
||||||
freed_bytes / GiB_bytes,
|
format_gib(freed_bytes),
|
||||||
used_bytes / GiB_bytes,
|
format_gib(used_bytes),
|
||||||
)
|
)
|
||||||
|
|
||||||
def wake_up(self, tags: list[str] | None = None) -> None:
|
def wake_up(self, tags: list[str] | None = None) -> None:
|
||||||
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
|
|||||||
# take current memory snapshot
|
# take current memory snapshot
|
||||||
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
|
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
|
||||||
self.requested_memory = request_memory(init_snapshot, self.cache_config)
|
self.requested_memory = request_memory(init_snapshot, self.cache_config)
|
||||||
|
logger.debug("worker init memory snapshot: %r", self.init_snapshot)
|
||||||
|
logger.debug(
|
||||||
|
"worker requested memory: %sGiB", format_gib(self.requested_memory)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Not support device type: {self.device_config.device}")
|
raise RuntimeError(f"Not support device type: {self.device_config.device}")
|
||||||
|
|
||||||
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
|
|||||||
You may limit the usage of GPU memory
|
You may limit the usage of GPU memory
|
||||||
by adjusting the `gpu_memory_utilization` parameter.
|
by adjusting the `gpu_memory_utilization` parameter.
|
||||||
"""
|
"""
|
||||||
GiB = lambda b: b / GiB_bytes
|
|
||||||
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
|
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
|
||||||
# still need a profile run which compiles the model for
|
# still need a profile run which compiles the model for
|
||||||
# max_num_batched_tokens
|
# max_num_batched_tokens
|
||||||
self.model_runner.profile_run()
|
self.model_runner.profile_run()
|
||||||
|
|
||||||
msg = (
|
msg = (
|
||||||
f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
|
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
|
||||||
f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
|
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
|
||||||
"KV Cache as specified by kv_cache_memory_bytes config and "
|
"KV Cache as specified by kv_cache_memory_bytes config and "
|
||||||
"skipped memory profiling. This does not respect the "
|
"skipped memory profiling. This does not respect the "
|
||||||
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
|
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
|
||||||
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
|
|||||||
# GPU did not change their memory usage during the profiling.
|
# GPU did not change their memory usage during the profiling.
|
||||||
assert self.init_snapshot.free_memory > free_gpu_memory, (
|
assert self.init_snapshot.free_memory > free_gpu_memory, (
|
||||||
"Error in memory profiling. "
|
"Error in memory profiling. "
|
||||||
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
|
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
|
||||||
f"current free memory {GiB(free_gpu_memory)} GiB. "
|
f"current free memory {format_gib(free_gpu_memory)} GiB. "
|
||||||
"This happens when other processes sharing the same container "
|
"This happens when other processes sharing the same container "
|
||||||
"release GPU memory while vLLM is profiling during initialization. "
|
"release GPU memory while vLLM is profiling during initialization. "
|
||||||
"To fix this, ensure consistent GPU memory allocation or "
|
"To fix this, ensure consistent GPU memory allocation or "
|
||||||
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
|
|||||||
|
|
||||||
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
|
unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
|
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
|
||||||
GiB(self.init_snapshot.free_memory),
|
format_gib(self.init_snapshot.free_memory),
|
||||||
self.cache_config.gpu_memory_utilization,
|
self.cache_config.gpu_memory_utilization,
|
||||||
GiB(self.requested_memory),
|
format_gib(self.requested_memory),
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Free memory after profiling: %.2f GiB (total), "
|
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
|
||||||
"%.2f GiB (within requested)",
|
format_gib(free_gpu_memory),
|
||||||
GiB(free_gpu_memory),
|
format_gib(free_gpu_memory - unrequested_memory),
|
||||||
GiB(free_gpu_memory - unrequested_memory),
|
|
||||||
)
|
)
|
||||||
logger.debug(profile_result)
|
logger.debug(profile_result)
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"Available KV cache memory: %.2f GiB",
|
"Available KV cache memory: %f GiB",
|
||||||
GiB(self.available_kv_cache_memory_bytes),
|
format_gib(self.available_kv_cache_memory_bytes),
|
||||||
scope="local",
|
scope="local",
|
||||||
)
|
)
|
||||||
gc.collect()
|
gc.collect()
|
||||||
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
|
|||||||
# CUDAGraph memory size and may not utilize all gpu memory.
|
# CUDAGraph memory size and may not utilize all gpu memory.
|
||||||
# Users may want fine-grained control to specify kv cache
|
# Users may want fine-grained control to specify kv cache
|
||||||
# memory size.
|
# memory size.
|
||||||
GiB = lambda b: round(b / GiB_bytes, 2)
|
|
||||||
|
|
||||||
# empirically observed that the memory profiling may
|
# empirically observed that the memory profiling may
|
||||||
# slightly underestimate the memory consumption.
|
# slightly underestimate the memory consumption.
|
||||||
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
|
|||||||
|
|
||||||
msg = (
|
msg = (
|
||||||
f"Free memory on device "
|
f"Free memory on device "
|
||||||
f"({GiB(self.init_snapshot.free_memory)}/"
|
f"({format_gib(self.init_snapshot.free_memory)}/"
|
||||||
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
|
f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
|
||||||
f"Desired GPU memory utilization is "
|
f"Desired GPU memory utilization is "
|
||||||
f"({self.cache_config.gpu_memory_utilization}, "
|
f"({self.cache_config.gpu_memory_utilization}, "
|
||||||
f"{GiB(self.requested_memory)} GiB). "
|
f"{format_gib(self.requested_memory)} GiB). "
|
||||||
f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
|
f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
|
||||||
f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
|
f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
|
||||||
f"for peak activation, {GiB(self.non_torch_memory)} GiB "
|
f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
|
||||||
f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
|
f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
|
||||||
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
|
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
|
||||||
f"config with `--kv-cache-memory="
|
f"config with `--kv-cache-memory="
|
||||||
f"{kv_cache_memory_bytes_to_requested_limit}` "
|
f"{kv_cache_memory_bytes_to_requested_limit}` "
|
||||||
f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
|
f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
|
||||||
f"into requested memory, or `--kv-cache-memory="
|
f"into requested memory, or `--kv-cache-memory="
|
||||||
f"{kv_cache_memory_bytes_to_gpu_limit}` "
|
f"{kv_cache_memory_bytes_to_gpu_limit}` "
|
||||||
f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
|
f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
|
||||||
f"utilize gpu memory. Current kv cache memory in use is "
|
f"utilize gpu memory. Current kv cache memory in use is "
|
||||||
f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
|
f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(msg)
|
logger.debug(msg)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import math
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
|
|||||||
from vllm.multimodal.cache import processor_only_cache_from_config
|
from vllm.multimodal.cache import processor_only_cache_from_config
|
||||||
from vllm.multimodal.registry import MultiModalRegistry
|
from vllm.multimodal.registry import MultiModalRegistry
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.mem_constants import GiB_bytes
|
from vllm.utils.mem_utils import MemorySnapshot, format_gib
|
||||||
from vllm.utils.mem_utils import MemorySnapshot
|
|
||||||
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
||||||
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
|
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
|
||||||
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
|
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
|
||||||
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
|
|||||||
return placeholders[is_embed]
|
return placeholders[is_embed]
|
||||||
|
|
||||||
|
|
||||||
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
|
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
|
||||||
"""
|
"""
|
||||||
Calculate the amount of memory required by vLLM, then validate
|
Calculate the amount of memory required by vLLM, then validate
|
||||||
that the current amount of free memory is sufficient for that.
|
that the current amount of free memory is sufficient for that.
|
||||||
"""
|
"""
|
||||||
requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
|
requested_memory = math.ceil(
|
||||||
|
init_snapshot.total_memory * cache_config.gpu_memory_utilization
|
||||||
|
)
|
||||||
|
|
||||||
if init_snapshot.free_memory < requested_memory:
|
if init_snapshot.free_memory < requested_memory:
|
||||||
GiB = lambda b: round(b / GiB_bytes, 2)
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Free memory on device {init_snapshot.device_} "
|
f"Free memory on device {init_snapshot.device_} "
|
||||||
f"({GiB(init_snapshot.free_memory)}/"
|
f"({format_gib(init_snapshot.free_memory)}/"
|
||||||
f"{GiB(init_snapshot.total_memory)} GiB) on startup "
|
f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
|
||||||
f"is less than desired GPU memory utilization "
|
f"is less than desired GPU memory utilization "
|
||||||
f"({cache_config.gpu_memory_utilization}, "
|
f"({cache_config.gpu_memory_utilization}, "
|
||||||
f"{GiB(requested_memory)} GiB). Decrease GPU memory "
|
f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
|
||||||
f"utilization or reduce GPU memory used by other processes."
|
f"utilization or reduce GPU memory used by other processes."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user