[core] overhaul memory profiling and fix backward compatibility (#10511)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2024-12-16 13:32:25 -08:00
committed by GitHub
parent efbce85f4d
commit 551603feff
8 changed files with 236 additions and 60 deletions

View File

@@ -31,10 +31,6 @@ def test_gpu_memory_profiling():
is_driver_worker=True,
)
-    # Load the model so we can profile it
-    worker.init_device()
-    worker.load_model()
# Set 10GiB as the total gpu ram to be device-agnostic
def mock_mem_info():
current_usage = torch.cuda.memory_stats(
@@ -46,20 +42,24 @@ def test_gpu_memory_profiling():
from unittest.mock import patch
with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info):
+        # Load the model so we can profile it
+        worker.init_device()
+        worker.load_model()
gpu_blocks, _ = worker.determine_num_available_blocks()
-    # Peak vram usage by torch should be 0.7077 GiB
+    # Peak vram usage by torch should be 0.47 GiB
# Model weights take 0.25 GiB
# No memory should be allocated outside of torch
# 9.0 GiB should be the utilization target
-    # 8.2923 GiB should be available for the KV cache
+    # 8.28 GiB should be available for the KV cache
block_size = CacheEngine.get_cache_block_size(
engine_config.cache_config, engine_config.model_config,
engine_config.parallel_config)
-    expected_blocks = (8.2923 * 1024**3) // block_size
+    expected_blocks = (8.28 * 1024**3) // block_size
# Check within a small tolerance for portability
# Hardware, kernel, or dependency changes could all affect memory
# utilization.
-    # A 10 block tolerance here should be about 6MB of wiggle room.
-    assert abs(gpu_blocks - expected_blocks) < 10
+    # A 100 block tolerance here should be about 60MB of wiggle room.
+    assert abs(gpu_blocks - expected_blocks) < 100