[core][model] yet another cpu offload implementation (#6496)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-17 20:54:35 -07:00
parent 18fecc3559
commit 1c27d25fb5
7 changed files with 128 additions and 4 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -433,6 +433,7 @@ class CacheConfig:
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
+        cpu_offload_gb: float = 0,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
@@ -441,6 +442,7 @@ class CacheConfig:
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self.enable_prefix_caching = enable_prefix_caching
+        self.cpu_offload_gb = cpu_offload_gb
        self._verify_args()
        self._verify_cache_dtype()
        self._verify_prefix_caching()