[Feature] Support CPU Offloading without PyTorch Pinned Memory, which leads to doubled allocation (#32993)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-13 11:11:26 -05:00
parent 4a9952ec1b
commit 59d53066d8
6 changed files with 127 additions and 62 deletions
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -678,12 +678,18 @@ def get_accelerator_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tens
    """
    Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA).
    """
-    assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
    from vllm.platforms import current_platform

    if current_platform.is_xpu():
+        assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
        return torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor)
-    return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
+    elif current_platform.is_cuda():
+        return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
+    else:
+        raise ValueError(
+            f"`get_accelerator_view_from_cpu_tensor` is currently "
+            f"not supported in: {current_platform.device_name}"
+        )


 # Helper function used in testing.