[core] overhaul memory profiling and fix backward compatibility (#10511)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
@@ -31,10 +31,6 @@ def test_gpu_memory_profiling():
|
||||
is_driver_worker=True,
|
||||
)
|
||||
|
||||
# Load the model so we can profile it
|
||||
worker.init_device()
|
||||
worker.load_model()
|
||||
|
||||
# Set 10GiB as the total gpu ram to be device-agnostic
|
||||
def mock_mem_info():
|
||||
current_usage = torch.cuda.memory_stats(
|
||||
@@ -46,20 +42,24 @@ def test_gpu_memory_profiling():
|
||||
|
||||
from unittest.mock import patch
|
||||
with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info):
|
||||
# Load the model so we can profile it
|
||||
worker.init_device()
|
||||
worker.load_model()
|
||||
gpu_blocks, _ = worker.determine_num_available_blocks()
|
||||
|
||||
# Peak vram usage by torch should be 0.7077 GiB
|
||||
# Peak vram usage by torch should be 0.47 GiB
|
||||
# Model weights take 0.25 GiB
|
||||
# No memory should be allocated outside of torch
|
||||
# 9.0 GiB should be the utilization target
|
||||
# 8.2923 GiB should be available for the KV cache
|
||||
# 8.28 GiB should be available for the KV cache
|
||||
block_size = CacheEngine.get_cache_block_size(
|
||||
engine_config.cache_config, engine_config.model_config,
|
||||
engine_config.parallel_config)
|
||||
|
||||
expected_blocks = (8.2923 * 1024**3) // block_size
|
||||
expected_blocks = (8.28 * 1024**3) // block_size
|
||||
|
||||
# Check within a small tolerance for portability
|
||||
# Hardware, kernel, or dependency changes could all affect memory
|
||||
# utilization.
|
||||
# A 10 block tolerance here should be about 6MB of wiggle room.
|
||||
assert abs(gpu_blocks - expected_blocks) < 10
|
||||
# A 100 block tolerance here should be about 60MB of wiggle room.
|
||||
assert abs(gpu_blocks - expected_blocks) < 100
|
||||
|
||||
Reference in New Issue
Block a user