[Feature][Perf] Support Selective CPU Weight Offloading (#34535)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
This commit is contained in:
@@ -101,6 +101,17 @@ class CacheConfig:
|
|||||||
Note that this requires fast CPU-GPU interconnect, as part of the model is
|
Note that this requires fast CPU-GPU interconnect, as part of the model is
|
||||||
loaded from CPU memory to GPU memory on the fly in each model forward pass.
|
loaded from CPU memory to GPU memory on the fly in each model forward pass.
|
||||||
"""
|
"""
|
||||||
|
cpu_offload_params: set[str] = Field(default_factory=set)
|
||||||
|
""" The set of parameter name segments to target for CPU offloading.
|
||||||
|
Unmatched parameters are not offloaded. If this set is empty, parameters
|
||||||
|
are offloaded non-selectively until the memory limit defined by
|
||||||
|
`cpu_offload_gb` is reached.
|
||||||
|
Examples:
|
||||||
|
- For parameter name "mlp.experts.w2_weight":
|
||||||
|
- "experts" or "experts.w2_weight" will match.
|
||||||
|
- "expert" or "w2" will NOT match (must be exact segments).
|
||||||
|
This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
|
||||||
|
"""
|
||||||
calculate_kv_scales: bool = False
|
calculate_kv_scales: bool = False
|
||||||
"""This enables dynamic calculation of `k_scale` and `v_scale` when
|
"""This enables dynamic calculation of `k_scale` and `v_scale` when
|
||||||
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
|
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
|
||||||
|
|||||||
@@ -434,6 +434,7 @@ class EngineArgs:
|
|||||||
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
|
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
|
||||||
swap_space: float = CacheConfig.swap_space
|
swap_space: float = CacheConfig.swap_space
|
||||||
cpu_offload_gb: float = CacheConfig.cpu_offload_gb
|
cpu_offload_gb: float = CacheConfig.cpu_offload_gb
|
||||||
|
cpu_offload_params: set[str] = get_field(CacheConfig, "cpu_offload_params")
|
||||||
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
|
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
|
||||||
kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
|
kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
|
||||||
max_num_batched_tokens: int | None = None
|
max_num_batched_tokens: int | None = None
|
||||||
@@ -942,6 +943,9 @@ class EngineArgs:
|
|||||||
"--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
|
"--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
|
||||||
)
|
)
|
||||||
cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
|
cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
|
||||||
|
cache_group.add_argument(
|
||||||
|
"--cpu-offload-params", **cache_kwargs["cpu_offload_params"]
|
||||||
|
)
|
||||||
cache_group.add_argument(
|
cache_group.add_argument(
|
||||||
"--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
|
"--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
|
||||||
)
|
)
|
||||||
@@ -1453,6 +1457,7 @@ class EngineArgs:
|
|||||||
enable_prefix_caching=self.enable_prefix_caching,
|
enable_prefix_caching=self.enable_prefix_caching,
|
||||||
prefix_caching_hash_algo=self.prefix_caching_hash_algo,
|
prefix_caching_hash_algo=self.prefix_caching_hash_algo,
|
||||||
cpu_offload_gb=self.cpu_offload_gb,
|
cpu_offload_gb=self.cpu_offload_gb,
|
||||||
|
cpu_offload_params=self.cpu_offload_params,
|
||||||
calculate_kv_scales=self.calculate_kv_scales,
|
calculate_kv_scales=self.calculate_kv_scales,
|
||||||
kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
|
kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
|
||||||
mamba_cache_dtype=self.mamba_cache_dtype,
|
mamba_cache_dtype=self.mamba_cache_dtype,
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ from vllm.model_executor.models.interfaces import supports_any_eagle
|
|||||||
from vllm.multimodal import NestedTensors
|
from vllm.multimodal import NestedTensors
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.math_utils import cdiv
|
from vllm.utils.math_utils import cdiv
|
||||||
|
from vllm.utils.mem_utils import format_gib
|
||||||
from vllm.utils.platform_utils import (
|
from vllm.utils.platform_utils import (
|
||||||
is_pin_memory_available,
|
is_pin_memory_available,
|
||||||
is_uva_available,
|
is_uva_available,
|
||||||
@@ -613,6 +614,7 @@ class PPMissingLayer(torch.nn.Identity):
|
|||||||
|
|
||||||
_CPU_OFFLOAD_BYTES = 0
|
_CPU_OFFLOAD_BYTES = 0
|
||||||
_CPU_OFFLOAD_MAX_BYTES = 0
|
_CPU_OFFLOAD_MAX_BYTES = 0
|
||||||
|
_CPU_OFFLOAD_PARAMS = set()
|
||||||
|
|
||||||
|
|
||||||
def set_cpu_offload_max_bytes(max_bytes: int) -> None:
|
def set_cpu_offload_max_bytes(max_bytes: int) -> None:
|
||||||
@@ -621,6 +623,11 @@ def set_cpu_offload_max_bytes(max_bytes: int) -> None:
|
|||||||
_CPU_OFFLOAD_MAX_BYTES = max_bytes
|
_CPU_OFFLOAD_MAX_BYTES = max_bytes
|
||||||
|
|
||||||
|
|
||||||
|
def set_cpu_offload_params(params: set[str]) -> None:
|
||||||
|
global _CPU_OFFLOAD_PARAMS
|
||||||
|
_CPU_OFFLOAD_PARAMS = params
|
||||||
|
|
||||||
|
|
||||||
def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
|
def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
|
||||||
if (params := next(module.parameters(), None)) is None:
|
if (params := next(module.parameters(), None)) is None:
|
||||||
return module
|
return module
|
||||||
@@ -642,12 +649,23 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
|
|||||||
# offload parameters to CPU
|
# offload parameters to CPU
|
||||||
# use pin_memory if possible, which helps cudagraph capture speed
|
# use pin_memory if possible, which helps cudagraph capture speed
|
||||||
offloaded_parameters = False
|
offloaded_parameters = False
|
||||||
for p in module.parameters():
|
for name, p in module.named_parameters():
|
||||||
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
|
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
|
||||||
# we use per-parameter offloading
|
# we use per-parameter offloading
|
||||||
# one module might have some parameters offloaded and some not
|
# one module might have some parameters offloaded and some not
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if _CPU_OFFLOAD_PARAMS:
|
||||||
|
# Check if parameter belongs to the offloading set
|
||||||
|
# Add dots here to ensure we match full segments only
|
||||||
|
# e.g., "experts.w2_weight" matches "mlp.experts.w2_weight" but not
|
||||||
|
# "mlp.experts.w2_weight_scale"
|
||||||
|
should_offload = any(
|
||||||
|
f".{param}." in f".{name}." for param in _CPU_OFFLOAD_PARAMS
|
||||||
|
)
|
||||||
|
if not should_offload:
|
||||||
|
continue
|
||||||
|
|
||||||
cpu_data = p.data.to(device="cpu")
|
cpu_data = p.data.to(device="cpu")
|
||||||
if pin_memory:
|
if pin_memory:
|
||||||
cpu_data = cpu_data.pin_memory()
|
cpu_data = cpu_data.pin_memory()
|
||||||
@@ -708,6 +726,10 @@ def make_layers(
|
|||||||
]
|
]
|
||||||
+ [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]
|
+ [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]
|
||||||
)
|
)
|
||||||
|
if _CPU_OFFLOAD_MAX_BYTES > 0:
|
||||||
|
logger.info(
|
||||||
|
"Total CPU offloaded parameters: %s GBs", format_gib(_CPU_OFFLOAD_BYTES)
|
||||||
|
)
|
||||||
return start_layer, end_layer, modules
|
return start_layer, end_layer, modules
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -345,9 +345,13 @@ class GPUModelRunner(
|
|||||||
self.speculative_config = vllm_config.speculative_config
|
self.speculative_config = vllm_config.speculative_config
|
||||||
self.observability_config = vllm_config.observability_config
|
self.observability_config = vllm_config.observability_config
|
||||||
|
|
||||||
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
|
from vllm.model_executor.models.utils import (
|
||||||
|
set_cpu_offload_max_bytes,
|
||||||
|
set_cpu_offload_params,
|
||||||
|
)
|
||||||
|
|
||||||
set_cpu_offload_max_bytes(int(self.cache_config.cpu_offload_gb * 1024**3))
|
set_cpu_offload_max_bytes(int(self.cache_config.cpu_offload_gb * 1024**3))
|
||||||
|
set_cpu_offload_params(self.cache_config.cpu_offload_params)
|
||||||
|
|
||||||
model_config = self.model_config
|
model_config = self.model_config
|
||||||
cache_config = self.cache_config
|
cache_config = self.cache_config
|
||||||
|
|||||||
Reference in New Issue
Block a user