[Bugfix][Core] Use torch.cuda.memory_stats() to profile peak memory usage (#9352)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
This commit is contained in:
@@ -26,10 +26,12 @@ def test_lazy_outlines(sample_regex):
     # make sure outlines is not imported
     assert 'outlines' not in sys.modules
 
+    # The second LLM needs to request a higher gpu_memory_utilization because
+    # the first LLM has already allocated a full 30% of the gpu memory.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               guided_decoding_backend="lm-format-enforcer",
-              gpu_memory_utilization=0.3)
+              gpu_memory_utilization=0.6)
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
     outputs = llm.generate(
         prompts=[
||||
@@ -44,7 +44,7 @@ def test_offline_mode(llm: LLM, monkeypatch):
         LLM(model=MODEL_NAME,
             max_num_batched_tokens=4096,
             tensor_parallel_size=1,
-            gpu_memory_utilization=0.10,
+            gpu_memory_utilization=0.20,
             enforce_eager=True)
     finally:
         # Reset the environment after the test
Reference in New Issue
Block a user