[Feature] Support CPU offloading without PyTorch pinned memory, which leads to doubled allocation (#32993)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Wei Zhao
2026-02-13 11:11:26 -05:00
committed by GitHub
parent 4a9952ec1b
commit 59d53066d8
6 changed files with 127 additions and 62 deletions

View File

@@ -678,12 +678,18 @@ def get_accelerator_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tens
"""
Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA).
"""
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
from vllm.platforms import current_platform
if current_platform.is_xpu():
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
return torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor)
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
elif current_platform.is_cuda():
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
else:
raise ValueError(
f"`get_accelerator_view_from_cpu_tensor` is currently "
f"not supported in: {current_platform.device_name}"
)
# Helper function used in testing.