[Bugfix][Core] Use torch.cuda.memory_stats() to profile peak memory usage (#9352)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-10-17 21:47:27 -05:00
parent 48138a8415
commit de4008e2ab
4 changed files with 122 additions and 17 deletions
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -26,10 +26,12 @@ def test_lazy_outlines(sample_regex):
    # make sure outlines is not imported
    assert 'outlines' not in sys.modules

+    # The second LLM needs to request a higher gpu_memory_utilization because
+    # the first LLM has already allocated a full 30% of the GPU memory.
    llm = LLM(model="facebook/opt-125m",
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
-              gpu_memory_utilization=0.3)
+              gpu_memory_utilization=0.6)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    outputs = llm.generate(
        prompts=[
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -44,7 +44,7 @@ def test_offline_mode(llm: LLM, monkeypatch):
        LLM(model=MODEL_NAME,
            max_num_batched_tokens=4096,
            tensor_parallel_size=1,
-            gpu_memory_utilization=0.10,
+            gpu_memory_utilization=0.20,
            enforce_eager=True)
    finally:
        # Reset the environment after the test