[misc] improve memory profiling (#11809)
Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
@@ -5,6 +5,7 @@ from typing import AsyncIterator, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm_test_utils import monitor
|
||||
|
||||
from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
|
||||
get_open_port, memory_profiling, merge_async_iterators,
|
||||
@@ -289,8 +290,16 @@ def test_memory_profiling():
|
||||
|
||||
weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
|
||||
|
||||
def measure_current_non_torch():
    """Return the number of bytes of GPU memory used outside PyTorch.

    Computed as (total device memory in use) minus (memory reserved by
    PyTorch's caching allocator); the remainder is memory held by other
    sources (e.g. raw cudaMalloc or NCCL buffers, as exercised by the
    surrounding test).
    """
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    device_used = total_bytes - free_bytes
    # Everything the caching allocator holds counts as "torch" memory,
    # even if not currently allocated to tensors.
    return device_used - torch.cuda.memory_reserved()
|
||||
|
||||
with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
|
||||
weights_memory_in_bytes=weights_memory_in_bytes) as result:
|
||||
weights_memory_in_bytes=weights_memory_in_bytes) as result, \
|
||||
monitor(measure_current_non_torch) as monitored_values:
|
||||
# make a memory spike, 1 GiB
|
||||
spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
|
||||
del spike
|
||||
@@ -298,7 +307,15 @@ def test_memory_profiling():
|
||||
# Add some extra non-torch memory 256 MiB (simulate NCCL)
|
||||
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
|
||||
|
||||
# this is an analytic value, it is exact,
|
||||
# we only have 256 MiB non-torch memory increase
|
||||
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
|
||||
assert measured_diff == 256 * 1024 * 1024
|
||||
|
||||
# Check that the memory usage is within 5% of the expected values
|
||||
# 5% tolerance is caused by PyTorch caching allocator,
|
||||
# we cannot control PyTorch's behavior of its internal buffers,
|
||||
# which causes a small error (<10 MiB in practice)
|
||||
non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
|
||||
torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
|
||||
assert abs(non_torch_ratio - 1) <= 0.05
|
||||
|
||||
Reference in New Issue
Block a user