From b37b679770aade27f33d20c93bf467c6a7fba65d Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 13 Feb 2026 23:02:24 -0500 Subject: [PATCH] [Feature][Perf] Support Selective CPU Weight Offloading (#34535) Signed-off-by: wzhao18 --- vllm/config/cache.py | 11 +++++++++++ vllm/engine/arg_utils.py | 5 +++++ vllm/model_executor/models/utils.py | 24 +++++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 6 +++++- 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index bf121e544..149b0b9b7 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -101,6 +101,17 @@ class CacheConfig: Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass. """ + cpu_offload_params: set[str] = Field(default_factory=set) + """ The set of parameter name segments to target for CPU offloading. + Unmatched parameters are not offloaded. If this set is empty, parameters + are offloaded non-selectively until the memory limit defined by + `cpu_offload_gb` is reached. + Examples: + - For parameter name "mlp.experts.w2_weight": + - "experts" or "experts.w2_weight" will match. + - "expert" or "w2" will NOT match (must be exact segments). + This allows distinguishing parameters like "w2_weight" and "w2_weight_scale". + """ calculate_kv_scales: bool = False """This enables dynamic calculation of `k_scale` and `v_scale` when kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 84176e207..feb9d1bc8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -434,6 +434,7 @@ class EngineArgs: disable_cascade_attn: bool = ModelConfig.disable_cascade_attn swap_space: float = CacheConfig.swap_space cpu_offload_gb: float = CacheConfig.cpu_offload_gb + cpu_offload_params: set[str] = get_field(CacheConfig, "cpu_offload_params") gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes max_num_batched_tokens: int | None = None @@ -942,6 +943,9 @@ class EngineArgs: "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"] ) cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"]) + cache_group.add_argument( + "--cpu-offload-params", **cache_kwargs["cpu_offload_params"] + ) cache_group.add_argument( "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"] ) @@ -1453,6 +1457,7 @@ class EngineArgs: enable_prefix_caching=self.enable_prefix_caching, prefix_caching_hash_algo=self.prefix_caching_hash_algo, cpu_offload_gb=self.cpu_offload_gb, + cpu_offload_params=self.cpu_offload_params, calculate_kv_scales=self.calculate_kv_scales, kv_sharing_fast_prefill=self.kv_sharing_fast_prefill, mamba_cache_dtype=self.mamba_cache_dtype, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c942178d0..658742489 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -31,6 +31,7 @@ from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils.math_utils import cdiv +from vllm.utils.mem_utils import format_gib from vllm.utils.platform_utils import ( is_pin_memory_available, is_uva_available, @@ -613,6 +614,7 @@ class PPMissingLayer(torch.nn.Identity): _CPU_OFFLOAD_BYTES = 0 _CPU_OFFLOAD_MAX_BYTES = 0 +_CPU_OFFLOAD_PARAMS = set() def set_cpu_offload_max_bytes(max_bytes: int) -> None: @@ -621,6 +623,11 @@ def set_cpu_offload_max_bytes(max_bytes: int) -> None: _CPU_OFFLOAD_MAX_BYTES = max_bytes +def set_cpu_offload_params(params: set[str]) -> None: + global _CPU_OFFLOAD_PARAMS + _CPU_OFFLOAD_PARAMS = params + + def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module: if (params := next(module.parameters(), None)) is None: return module @@ -642,12 +649,23 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module: # offload parameters to CPU # use pin_memory if possible, which helps cudagraph capture speed offloaded_parameters = False - for p in module.parameters(): + for name, p in module.named_parameters(): if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES: # we use per-parameter offloading # one module might have some parameters offloaded and some not break + if _CPU_OFFLOAD_PARAMS: + # Check if parameter belongs to the offloading set + # Add dots here to ensure we match full segments only + # e.g., "experts.w2_weight" matches "mlp.experts.w2_weight" but not + # "mlp.experts.w2_weight_scale" + should_offload = any( + f".{param}." in f".{name}." for param in _CPU_OFFLOAD_PARAMS + ) + if not should_offload: + continue + cpu_data = p.data.to(device="cpu") if pin_memory: cpu_data = cpu_data.pin_memory() @@ -708,6 +726,10 @@ def make_layers( ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)] ) + if _CPU_OFFLOAD_MAX_BYTES > 0: + logger.info( + "Total CPU offloaded parameters: %s GBs", format_gib(_CPU_OFFLOAD_BYTES) + ) return start_layer, end_layer, modules diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c9fc056be..41ec06230 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -345,9 +345,13 @@ class GPUModelRunner( self.speculative_config = vllm_config.speculative_config self.observability_config = vllm_config.observability_config - from vllm.model_executor.models.utils import set_cpu_offload_max_bytes + from vllm.model_executor.models.utils import ( + set_cpu_offload_max_bytes, + set_cpu_offload_params, + ) set_cpu_offload_max_bytes(int(self.cache_config.cpu_offload_gb * 1024**3)) + set_cpu_offload_params(self.cache_config.cpu_offload_params) model_config = self.model_config cache_config = self.cache_config