Signed-off-by: wzhao18 <wzhao18.sz@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
30 lines
824 B
Python
30 lines
824 B
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import pytest
|
|
|
|
from ..utils import compare_two_settings
|
|
|
|
|
|
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
    """Check a CPU-offloaded run against a default run of the same model.

    The first setting passes no extra CLI arguments and no environment
    overrides. The second passes ``--cpu-offload-gb 1`` together with the
    two ``VLLM_WEIGHT_OFFLOADING_*`` environment variables, each set to
    ``"0"`` or ``"1"`` from the parametrized flags.

    NOTE(review): the actual comparison is done by ``compare_two_settings``
    from ``..utils``; what it asserts is not visible from this file.
    """
    # cuda graph only works with UVA offloading, so fall back to eager
    # execution whenever UVA is disabled.
    eager_flags = ["--enforce-eager"] if disable_uva else []
    offload_args = ["--cpu-offload-gb", "1", *eager_flags]

    # Environment values are strings: booleans are encoded as "0" / "1".
    offload_env = {
        "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": f"{int(disable_pin_memory)}",
        "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": f"{int(disable_uva)}",
    }

    compare_two_settings(
        model="hmellor/tiny-random-LlamaForCausalLM",
        arg1=[],
        arg2=offload_args,
        env1=None,
        env2=offload_env,
    )
|