[V1] Fully Transparent Implementation of CPU Offloading (#15354)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-31 20:22:34 +08:00
parent e7ae3bf3d6
commit 555aa21905
12 changed files with 148 additions and 25 deletions
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -795,6 +795,14 @@ def is_pin_memory_available() -> bool:
    return current_platform.is_pin_memory_available()


+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available (memoized)."""
+    # UVA requires pinned memory, so that is currently the only check made.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
+
+
 class DeviceMemoryProfiler:

    def __init__(self, device: Optional[torch.types.Device] = None):
@@ -1645,6 +1653,14 @@ def weak_ref_tensors(
    raise ValueError("Invalid type for tensors")


+def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
+    """Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
+    `cpu_tensor` must be pinned; the assert below rejects unpinned tensors.
+    """
+    assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
+    return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
+
+
 def is_in_doc_build() -> bool:
    try:
        from sphinx.ext.autodoc.mock import _MockModule