[Bugfix][Core] Use torch.cuda.memory_stats() to profile peak memory usage (#9352)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
This commit is contained in:
Joe Runde
2024-10-17 21:47:27 -05:00
committed by GitHub
parent 48138a8415
commit de4008e2ab
4 changed files with 122 additions and 17 deletions

View File

@@ -26,10 +26,12 @@ def test_lazy_outlines(sample_regex):
# make sure outlines is not imported
assert 'outlines' not in sys.modules
# The second LLM needs to request a higher gpu_memory_utilization because
# the first LLM has already allocated a full 30% of the gpu memory.
llm = LLM(model="facebook/opt-125m",
enforce_eager=True,
guided_decoding_backend="lm-format-enforcer",
gpu_memory_utilization=0.3)
gpu_memory_utilization=0.6)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(
prompts=[