[core] overhaul memory profiling and fix backward compatibility (#10511)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2024-12-16 13:32:25 -08:00
committed by GitHub
parent efbce85f4d
commit 551603feff
8 changed files with 236 additions and 60 deletions

View File

@@ -5,11 +5,13 @@ from functools import partial
from typing import AsyncIterator, Tuple
import pytest
import torch
from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
get_open_port, merge_async_iterators, supports_kw)
get_open_port, memory_profiling, merge_async_iterators,
supports_kw)
from .utils import error_on_warning
from .utils import error_on_warning, fork_new_process_for_each_test
@pytest.mark.asyncio
@@ -270,3 +272,41 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
requires_kw_only=requires_kw_only,
allow_var_kwargs=allow_var_kwargs
) == is_supported
@fork_new_process_for_each_test
def test_memory_profiling():
    """Validate ``memory_profiling`` by faking model loading + inference.

    Allocates CUDA memory outside of torch (via raw ``cudaMalloc``) to mimic
    other processes / NCCL buffers, plus torch tensors for weights and a
    transient activation spike, then checks that the profiler attributes
    each category of usage correctly (within 5%).
    """
    # Non-torch allocations must go through the CUDA runtime directly so
    # they are invisible to torch's caching allocator.
    from vllm.distributed.device_communicators.cuda_wrapper import (
        CudaRTLibrary)
    lib = CudaRTLibrary()

    MiB = 1024 * 1024
    GiB = 1024 * MiB

    # 512 MiB allocation outside of this instance
    # (memory used by other processes shows up as cuda usage outside torch).
    handle1 = lib.cudaMalloc(512 * MiB)

    # Snapshot device usage ONCE so free/total come from the same reading;
    # calling mem_get_info() twice could race with other allocations.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    baseline_memory_in_bytes = total_bytes - free_bytes

    # "Load weights": 128M float32 elements -> 512 MiB of torch memory.
    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
    weights_memory_in_bytes = 128 * MiB * 4  # 512 MiB

    with memory_profiling(
            baseline_memory_in_bytes=baseline_memory_in_bytes,
            weights_memory_in_bytes=weights_memory_in_bytes) as result:
        # Make a 1 GiB torch memory spike, then release it: it should show
        # up in the peak measurement but not in the final usage.
        spike = torch.randn(256, 1024, 1024,
                            device='cuda', dtype=torch.float32)
        del spike

        # Add some extra non-torch memory, 256 MiB (simulate NCCL buffers).
        handle2 = lib.cudaMalloc(256 * MiB)

    # Check that the measured usage is within 5% of the expected values.
    non_torch_ratio = result.non_torch_increase_in_bytes / (256 * MiB)  # noqa
    torch_peak_ratio = result.torch_peak_increase_in_bytes / GiB  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert abs(torch_peak_ratio - 1) <= 0.05

    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)