[Feature] Support CPU Offloading without Pytorch Pinned Memory that leads to doubled allocation (#32993)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
@@ -1,10 +1,29 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import compare_two_settings
|
||||
|
||||
|
||||
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
    """Check that CPU weight offloading matches the no-offloading baseline.

    Runs the model twice — once with default settings and once with
    ``--cpu-offload-gb 1`` — under every combination of the pin-memory and
    UVA offloading environment toggles, and asserts via
    ``compare_two_settings`` that both runs produce the same outputs.

    Args:
        disable_pin_memory: When True, sets
            ``VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY=1`` for the
            offloading run.
        disable_uva: When True, sets
            ``VLLM_WEIGHT_OFFLOADING_DISABLE_UVA=1`` for the offloading run.
    """
    # Env toggles apply only to the second (offloading) run; booleans are
    # encoded as "0"/"1" strings for the environment.
    env_vars = {
        "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)),
        "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)),
    }

    args = ["--cpu-offload-gb", "1"]

    # cuda graph only works with UVA offloading, so force eager execution
    # when UVA is disabled.
    if disable_uva:
        args.append("--enforce-eager")

    compare_two_settings(
        model="hmellor/tiny-random-LlamaForCausalLM",
        arg1=[],
        arg2=args,
        env1=None,
        env2=env_vars,
    )
|
||||
|
||||
Reference in New Issue
Block a user