[Feature] Support CPU Offloading without PyTorch Pinned Memory, which leads to doubled allocation (#32993)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Wei Zhao
2026-02-13 11:11:26 -05:00
committed by GitHub
parent 4a9952ec1b
commit 59d53066d8
6 changed files with 127 additions and 62 deletions

View File

@@ -1,10 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ..utils import compare_two_settings
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
    """Check that CPU weight offloading produces the same outputs as no offloading.

    Compares a baseline run (no extra args, no extra env) against a run with
    ``--cpu-offload-gb 1``, sweeping both offloading-related environment
    toggles:

    * ``VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY`` — presumably disables the
      use of pinned host memory for offloaded weights (TODO: confirm against
      the env-var definition).
    * ``VLLM_WEIGHT_OFFLOADING_DISABLE_UVA`` — presumably disables UVA-based
      offloading (TODO: confirm against the env-var definition).
    """
    env_vars = {
        "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)),
        "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)),
    }
    args = ["--cpu-offload-gb", "1"]
    # cuda graph only works with UVA offloading, so force eager mode when
    # UVA is disabled.
    if disable_uva:
        args.append("--enforce-eager")
    compare_two_settings(
        model="hmellor/tiny-random-LlamaForCausalLM",
        arg1=[],
        arg2=args,
        env1=None,
        env2=env_vars,
    )