[core] overhaul memory profiling and fix backward compatibility (#10511)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
@@ -5,11 +5,13 @@ from functools import partial
|
||||
from typing import AsyncIterator, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
|
||||
get_open_port, merge_async_iterators, supports_kw)
|
||||
get_open_port, memory_profiling, merge_async_iterators,
|
||||
supports_kw)
|
||||
|
||||
from .utils import error_on_warning
|
||||
from .utils import error_on_warning, fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -270,3 +272,41 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
|
||||
requires_kw_only=requires_kw_only,
|
||||
allow_var_kwargs=allow_var_kwargs
|
||||
) == is_supported
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
def test_memory_profiling():
    """Check the numbers reported by ``memory_profiling`` on a fake workload.

    The test fakes model loading and inference memory usage:

    * 512 MiB allocated through the CUDA runtime before profiling starts
      (memory that shows up as CUDA usage outside of torch),
    * 512 MiB of torch "weights" allocated before profiling starts,
    * a 1 GiB torch allocation made and released inside the profiled region,
    * 256 MiB allocated through the CUDA runtime inside the profiled region
      (simulating NCCL-style non-torch memory).

    It then asserts that ``result.non_torch_increase_in_bytes`` is within 5%
    of 256 MiB and ``result.torch_peak_increase_in_bytes`` is within 5% of
    1 GiB.
    """
    from vllm.distributed.device_communicators.cuda_wrapper import (
        CudaRTLibrary)

    mib = 1024 * 1024
    gib = 1024 * mib

    lib = CudaRTLibrary()
    # Raw CUDA allocations made by this test; released in ``finally`` so a
    # failing assertion does not skip the matching cudaFree calls.
    handles = []
    try:
        # 512 MiB allocation outside of this instance
        handles.append(lib.cudaMalloc(512 * mib))

        # Take a single (free, total) snapshot so both values are consistent
        # with each other.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        baseline_memory_in_bytes = total_bytes - free_bytes

        # load weights
        weights = torch.randn(128,
                              1024,
                              1024,
                              device='cuda',
                              dtype=torch.float32)
        weights_memory_in_bytes = 128 * 1024 * 1024 * 4  # 512 MiB

        with memory_profiling(
                baseline_memory_in_bytes=baseline_memory_in_bytes,
                weights_memory_in_bytes=weights_memory_in_bytes) as result:
            # make a memory spike, 1 GiB
            spike = torch.randn(256,
                                1024,
                                1024,
                                device='cuda',
                                dtype=torch.float32)
            del spike

            # Add some extra non-torch memory 256 MiB (simulate NCCL)
            handles.append(lib.cudaMalloc(256 * mib))

        # Check that the memory usage is within 5% of the expected values
        non_torch_ratio = result.non_torch_increase_in_bytes / (256 * mib)
        torch_peak_ratio = result.torch_peak_increase_in_bytes / gib
        assert abs(non_torch_ratio - 1) <= 0.05
        assert abs(torch_peak_ratio - 1) <= 0.05
        del weights
    finally:
        for handle in handles:
            lib.cudaFree(handle)
|
||||
|
||||
Reference in New Issue
Block a user