[V1] Fully Transparent Implementation of CPU Offloading (#15354)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2025-03-31 20:22:34 +08:00
committed by GitHub
parent e7ae3bf3d6
commit 555aa21905
12 changed files with 148 additions and 25 deletions

View File

@@ -69,6 +69,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
set_cpu_offload_max_bytes(
int(self.cache_config.cpu_offload_gb * 1024**3))
model_config = self.model_config
cache_config = self.cache_config
scheduler_config = self.scheduler_config