[core] further polish memory profiling (#12126)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-18 12:25:08 +08:00
parent c09503ddd6
commit da02cb4b27
3 changed files with 85 additions and 67 deletions
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,10 +9,10 @@ import torch
 from vllm_test_utils import monitor

 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
-                        StoreBoolean, bind_kv_cache, deprecate_kwargs,
-                        get_open_port, memory_profiling, merge_async_iterators,
-                        supports_kw)
+from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
+                        PlaceholderModule, StoreBoolean, bind_kv_cache,
+                        deprecate_kwargs, get_open_port, memory_profiling,
+                        merge_async_iterators, supports_kw)

 from .utils import error_on_warning, fork_new_process_for_each_test

@@ -284,14 +284,13 @@ def test_memory_profiling():
    # 512 MiB allocation outside of this instance
    handle1 = lib.cudaMalloc(512 * 1024 * 1024)

-    baseline_memory_in_bytes = \
-        torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
+    baseline_snapshot = MemorySnapshot()

    # load weights

    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)

-    weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
+    weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB

    def measure_current_non_torch():
        free, total = torch.cuda.mem_get_info()
@@ -300,8 +299,8 @@ def test_memory_profiling():
        current_non_torch = current_used - current_torch
        return current_non_torch

-    with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
-    weights_memory_in_bytes=weights_memory_in_bytes) as result, \
+    with memory_profiling(baseline_snapshot=baseline_snapshot,
+    weights_memory=weights_memory) as result, \
        monitor(measure_current_non_torch) as monitored_values:
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
@@ -316,13 +315,12 @@ def test_memory_profiling():
    assert measured_diff == 256 * 1024 * 1024

    # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by PyTorch caching allocator,
-    # we cannot control PyTorch's behavior of its internal buffers,
+    # 5% tolerance is caused by cuda runtime.
+    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
-    torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
-    assert abs(torch_peak_ratio - 1) <= 0.05
+    assert result.torch_peak_increase == 1024 * 1024 * 1024
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)