diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu index 9853fc942..73b368cb6 100644 --- a/csrc/cuda_view.cu +++ b/csrc/cuda_view.cu @@ -2,33 +2,58 @@ #include #include -// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned -// memory, and that UVA (Unified Virtual Addressing) is enabled. +// This function assumes that `cpu_tensor` is a CPU tensor, +// and that UVA (Unified Virtual Addressing) is enabled. torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU"); - // Get raw host pointer from CPU tensor - void* host_ptr = cpu_tensor.data_ptr(); + // handle empty tensor + if (cpu_tensor.numel() == 0) { + return torch::empty(cpu_tensor.sizes(), + cpu_tensor.options().device(torch::kCUDA)); + } + + if (cpu_tensor.is_pinned()) { + // If CPU tensor is pinned, directly get the device pointer. + void* host_ptr = const_cast(cpu_tensor.data_ptr()); + void* device_ptr = nullptr; + cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); + TORCH_CHECK(err == cudaSuccess, + "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + + return torch::from_blob( + device_ptr, cpu_tensor.sizes(), cpu_tensor.strides(), + [base = cpu_tensor](void*) {}, // keep cpu tensor alive + cpu_tensor.options().device(torch::kCUDA)); + } + + // If CPU tensor is not pinned, allocate a new pinned memory buffer. + torch::Tensor contiguous_cpu = cpu_tensor.contiguous(); + size_t nbytes = contiguous_cpu.nbytes(); + + void* host_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_ptr, nbytes, cudaHostAllocMapped); + if (err != cudaSuccess) { + AT_ERROR("cudaHostAlloc failed: ", cudaGetErrorString(err)); + } + + err = cudaMemcpy(host_ptr, contiguous_cpu.data_ptr(), nbytes, + cudaMemcpyDefault); + if (err != cudaSuccess) { + cudaFreeHost(host_ptr); + AT_ERROR("cudaMemcpy failed: ", cudaGetErrorString(err)); + } - // Get a device pointer corresponding to the pinned host memory void* device_ptr = nullptr; - cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); - TORCH_CHECK(err == cudaSuccess, - "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_ptr); + AT_ERROR("cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + } - // We'll use the same sizes, strides, and dtype as the CPU tensor. - // TODO: check if layout is respected. - auto sizes = cpu_tensor.sizes(); - auto strides = cpu_tensor.strides(); - auto options = cpu_tensor.options().device(torch::kCUDA); + auto deleter = [host_ptr](void*) { cudaFreeHost(host_ptr); }; - // use default no-op deleter, since the memory is owned by the original CPU - // tensor - torch::Tensor cuda_tensor = - torch::from_blob(device_ptr, sizes, strides, options); - - TORCH_CHECK(cuda_tensor.device().is_cuda(), - "Resulting tensor is not on CUDA device"); - - return cuda_tensor; -} + return torch::from_blob(device_ptr, contiguous_cpu.sizes(), + contiguous_cpu.strides(), deleter, + contiguous_cpu.options().device(torch::kCUDA)); +} \ No newline at end of file diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 89839372c..c1df36b36 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,10 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + from ..utils import compare_two_settings -def test_cpu_offload(): +@pytest.mark.parametrize("disable_pin_memory", [False, True]) +@pytest.mark.parametrize("disable_uva", [False, True]) +def test_cpu_offload(disable_pin_memory, disable_uva): + env_vars = { + "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)), + "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)), + } + + args = ["--cpu-offload-gb", "1"] + + # cuda graph only works with UVA offloading + if disable_uva: + args.append("--enforce-eager") + compare_two_settings( - "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"] + model="hmellor/tiny-random-LlamaForCausalLM", + arg1=[], + arg2=args, + env1=None, + env2=env_vars, ) diff --git a/vllm/envs.py b/vllm/envs.py index 039b3239c..674c1cde2 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -230,6 +230,8 @@ if TYPE_CHECKING: VLLM_USE_V2_MODEL_RUNNER: bool = False VLLM_LOG_MODEL_INSPECTION: bool = False VLLM_DEBUG_MFU_METRICS: bool = False + VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY: bool = False + VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False VLLM_DISABLE_LOG_LOGO: bool = False VLLM_LORA_DISABLE_PDL: bool = False @@ -1542,6 +1544,14 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_DEBUG_MFU_METRICS": lambda: bool( int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0")) ), + # Disable using pytorch's pin memory for CPU offloading. + "VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": lambda: bool( + int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY", "0")) + ), + # Disable using UVA (Unified Virtual Addressing) for CPU offloading. + "VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": lambda: bool( + int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_UVA", "0")) + ), # Disable logging of vLLM logo at server startup time. "VLLM_DISABLE_LOG_LOGO": lambda: bool(int(os.getenv("VLLM_DISABLE_LOG_LOGO", "0"))), # Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 51f62c15b..dc525c454 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,6 +11,7 @@ import torch from torch import nn from typing_extensions import assert_never +import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention, MLAAttention @@ -25,6 +26,7 @@ from vllm.model_executor.model_loader.reload import ( from vllm.model_executor.models.interfaces import SupportsQuant from vllm.tracing import instrument from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor logger = init_logger(__name__) @@ -111,7 +113,8 @@ def process_weights_after_loading( ): # TODO(lucas): see if there is a way to unify the signatures # of process_weights_after_loading - module.process_weights_after_loading(model_config.dtype) + with device_loading_context(module, target_device): + module.process_weights_after_loading(model_config.dtype) # Needed for torchao model reloading via model.reload_weights # @kylesayrs @jerryzh168 this can be removed if callers move to `reload_weights` @@ -127,38 +130,41 @@ def device_loading_context(module: torch.nn.Module, target_device: torch.device) return original_device_states: dict[str, torch.device] = {} + uva_offloaded_parameters: list[str] = [] # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): if p.device.type == "cpu": original_device_states[name] = p.device p.data = p.data.to(target_device) + if getattr(p, "_vllm_is_uva_offloaded", False): + uva_offloaded_parameters.append(name) # Parameters already on target device are not touched try: yield module finally: + use_pin_memory = ( + is_pin_memory_available() + and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY + ) # Restore parameters to their original devices, ignoring new parameters - pin_memory = is_pin_memory_available() for name, p in module.named_parameters(): if name in original_device_states: original_device: torch.device = original_device_states[name] - if original_device.type == "cpu": - # `torch.empty_like` does not support `pin_memory` argument - cpu_data = torch.empty_strided( - size=p.data.size(), - stride=p.data.stride(), - dtype=p.data.dtype, - layout=p.data.layout, - device="cpu", - pin_memory=pin_memory, - ) - cpu_data.copy_(p.data) - p.data = cpu_data - else: - p.data = p.data.to(original_device) - # New parameters or parameters already on target device are untouched + p.data = p.data.to(original_device) + + # parameter is UVA offloaded, but was replaced with a new device tensor + # re-offload it to CPU using UVA + if name in uva_offloaded_parameters and not getattr( + p, "_vllm_is_uva_offloaded", False + ): + cpu_data = p.data.to(device="cpu") + if use_pin_memory: + cpu_data = cpu_data.pin_memory() + p.data = get_accelerator_view_from_cpu_tensor(cpu_data) + p._vllm_is_uva_offloaded = True _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]() diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c47a6248a..c942178d0 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -13,6 +13,7 @@ from torch.func import functional_call from torch.nn.modules.module import register_module_module_registration_hook from transformers import PretrainedConfig +import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import ( get_tensor_model_parallel_rank, @@ -633,11 +634,10 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module: if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES: return module - pin_memory = is_pin_memory_available() - uva_available = is_uva_available() - - assert uva_available, "V1 CPU offloading requires uva (pin memory) support" - uva_offloading = True + pin_memory = ( + is_pin_memory_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY + ) + uva_offloading = is_uva_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_UVA # offload parameters to CPU # use pin_memory if possible, which helps cudagraph capture speed @@ -648,22 +648,16 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module: # one module might have some parameters offloaded and some not break - # `torch.empty_like` does not support `pin_memory` argument - cpu_data = torch.empty_strided( - size=p.data.size(), - stride=p.data.stride(), - dtype=p.data.dtype, - layout=p.data.layout, - device="cpu", - pin_memory=pin_memory, - ) - cpu_data.copy_(p.data) + cpu_data = p.data.to(device="cpu") + if pin_memory: + cpu_data = cpu_data.pin_memory() + if not uva_offloading: p.data = cpu_data else: - # keep the cpu data alive - p._vllm_offloaded_cpu_data = cpu_data p.data = get_accelerator_view_from_cpu_tensor(cpu_data) + p._vllm_is_uva_offloaded = True + _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size() offloaded_parameters = True @@ -678,7 +672,12 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module: k: v.to(device, non_blocking=True) for k, v in module.state_dict().items() } - output = functional_call(module, device_state, args=args, kwargs=kwargs) + + # set `tie_weights=False` as tied weights in original model + # become untied when calling .to(device) individually + output = functional_call( + module, device_state, args=args, kwargs=kwargs, tie_weights=False + ) module.forward = forward return output diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 0274b305e..1bff517fd 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -678,12 +678,18 @@ def get_accelerator_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tens """ Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA). """ - assert cpu_tensor.is_pinned(), "CPU tensor must be pinned" from vllm.platforms import current_platform if current_platform.is_xpu(): + assert cpu_tensor.is_pinned(), "CPU tensor must be pinned" return torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor) - return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) + elif current_platform.is_cuda(): + return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) + else: + raise ValueError( + f"`get_accelerator_view_from_cpu_tensor` is currently " + f"not supported in: {current_platform.device_name}" + ) # Helper function used in testing.