[Feature] Support CPU offloading without PyTorch pinned memory, which leads to doubled allocation (#32993)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
@@ -678,12 +678,18 @@ def get_accelerator_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tens
|
||||
"""
|
||||
Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA).
|
||||
"""
|
||||
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_xpu():
|
||||
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
|
||||
return torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor)
|
||||
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
|
||||
elif current_platform.is_cuda():
|
||||
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"`get_accelerator_view_from_cpu_tensor` is currently "
|
||||
f"not supported in: {current_platform.device_name}"
|
||||
)
|
||||
|
||||
|
||||
# Helper function used in testing.
|
||||
|
||||
Reference in New Issue
Block a user