[misc] improve memory profiling (#11809)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 889e662eae (parent ef68eb28d8)
Author: youkaichao
Date:   2025-01-08 14:36:03 +08:00
Committed by: GitHub

4 changed files with 94 additions and 8 deletions


@@ -1742,10 +1742,10 @@ class MemorySnapshot:
     timestamp: float = 0.0
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.peak"]
-        self.torch_memory_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
+        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
+        # torch.cuda.memory_reserved() is how many bytes
+        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
+        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
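
The hunk above moves the snapshot from the allocator's `allocated` counters to its `reserved` counters: `torch.cuda.memory_reserved()` reports how many bytes PyTorch's caching allocator has obtained from the CUDA driver, which is at least as large as the bytes held by live tensors, so cached-but-free blocks are counted too. A minimal sketch of the difference (not part of this commit; assumes a CUDA device is available):

    import torch

    torch.cuda.reset_peak_memory_stats()
    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
    print(torch.cuda.memory_stats()["allocated_bytes.all.current"])  # bytes in live tensors
    print(torch.cuda.memory_reserved())  # bytes PyTorch obtained via cudaMalloc
    del x
    print(torch.cuda.memory_stats()["allocated_bytes.all.current"])  # drops back down
    print(torch.cuda.memory_reserved())  # typically unchanged: the block stays cached
    print(torch.cuda.max_memory_reserved())  # peak reserved, what measure() now records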
@@ -1822,10 +1822,10 @@ def memory_profiling(
     The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
 
-    The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
 
     (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`.
+    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
     """  # noqa
 
     torch.cuda.reset_peak_memory_stats()
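
To make the accounting for (c.) concrete, here is a hedged sketch of the subtraction the docstring describes; the function name and arguments are illustrative, not the actual locals of `memory_profiling`:

    import torch

    def estimate_non_torch_memory(baseline_in_bytes: int,
                                  weights_memory_in_bytes: int,
                                  torch_reserved_diff_in_bytes: int) -> int:
        # torch.cuda.mem_get_info() returns (free, total) for the current
        # device, so total - free is everything in use on this GPU,
        # across all consumers.
        free_gpu, total_gpu = torch.cuda.mem_get_info()
        total_used = total_gpu - free_gpu
        # (c.) = total used, minus what was in use before profiling
        # (baseline), minus the model weights (a.), minus the growth in
        # PyTorch's own reserved pool over the profiling window.
        return (total_used - baseline_in_bytes - weights_memory_in_bytes
                - torch_reserved_diff_in_bytes)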