2025-02-02 14:58:18 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2025-06-03 11:20:17 -07:00
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
2025-02-02 14:58:18 -05:00
|
|
|
|
2026-02-13 11:11:26 -05:00
|
|
|
import pytest
|
|
|
|
|
|
2024-07-18 16:41:06 -07:00
|
|
|
from ..utils import compare_two_settings
|
|
|
|
|
|
|
|
|
|
|
2026-02-13 11:11:26 -05:00
|
|
|
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
    """Compare a default run against one with 1 GB of CPU weight offloading.

    The baseline setting passes no extra CLI arguments and no extra
    environment. The second setting passes ``--cpu-offload-gb 1`` and sets
    the two ``VLLM_WEIGHT_OFFLOADING_DISABLE_*`` environment variables to
    "0" or "1" according to the parametrized flags.

    Args:
        disable_pin_memory: Value exported (as "0"/"1") through
            ``VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY``.
        disable_uva: Value exported (as "0"/"1") through
            ``VLLM_WEIGHT_OFFLOADING_DISABLE_UVA``; when true the offloaded
            setting also runs with ``--enforce-eager``.
    """
    # cuda graph only works with UVA offloading, so fall back to eager mode
    # whenever UVA is disabled.
    eager_flag = ["--enforce-eager"] if disable_uva else []
    offload_args = ["--cpu-offload-gb", "1", *eager_flag]

    offload_env = {
        "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)),
        "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)),
    }

    # NOTE(review): compare_two_settings is defined in ..utils; presumably it
    # checks that both settings yield matching results — confirm there.
    compare_two_settings(
        model="hmellor/tiny-random-LlamaForCausalLM",
        arg1=[],
        arg2=offload_args,
        env1=None,
        env2=offload_env,
    )
|