[Feature] Support CPU Offloading without PyTorch Pinned Memory, which leads to doubled allocation (#32993)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Wei Zhao
2026-02-13 11:11:26 -05:00
committed by GitHub
parent 4a9952ec1b
commit 59d53066d8
6 changed files with 127 additions and 62 deletions

View File

@@ -1,10 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ..utils import compare_two_settings
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
    """Check that CPU weight offloading produces the same outputs as no offloading.

    Compares a baseline run (no extra args, no extra env) against a run with
    ``--cpu-offload-gb 1``, sweeping both offloading-related environment
    toggles:

    * ``VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY`` — presumably disables the
      use of pinned host memory for offloaded weights (TODO: confirm against
      the env-var definition).
    * ``VLLM_WEIGHT_OFFLOADING_DISABLE_UVA`` — presumably disables UVA-based
      offloading (TODO: confirm against the env-var definition).
    """
    env_vars = {
        "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)),
        "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)),
    }
    args = ["--cpu-offload-gb", "1"]
    # cuda graph only works with UVA offloading, so force eager mode when
    # UVA is disabled.
    if disable_uva:
        args.append("--enforce-eager")
    compare_two_settings(
        model="hmellor/tiny-random-LlamaForCausalLM",
        arg1=[],
        arg2=args,
        env1=None,
        env2=env_vars,
    )