[V0 Deprecation] Remove unused swap_space parameter (#36216)

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: mcelrath
commit 00b814ba5a (parent ee8a29511f)
Author: lif
Date: 2026-03-07 22:09:55 +08:00
Committed by: GitHub

22 changed files with 19 additions and 79 deletions


@@ -1,21 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Literal
 from pydantic import Field, SkipValidation, field_validator
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import format_gib, get_cpu_memory
-
-if TYPE_CHECKING:
-    from vllm.config.parallel import ParallelConfig
-else:
-    ParallelConfig = Any
 logger = init_logger(__name__)
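
The deleted import block used the standard typing.TYPE_CHECKING idiom: ParallelConfig is imported only during static type checking, which avoids a circular import at runtime, while the else branch keeps the name resolvable for annotations. A minimal sketch of the pattern, with a hypothetical module in place of vllm.config.parallel:

    from typing import TYPE_CHECKING, Any

    if TYPE_CHECKING:
        # Only the type checker executes this import, so no import cycle
        # can occur when the module is loaded at runtime.
        from mypkg.parallel import ParallelConfig  # hypothetical module
    else:
        # At runtime the annotation below still resolves, just to Any.
        ParallelConfig = Any

    def verify(parallel_config: "ParallelConfig") -> None:
        """Precise hints for type checkers, no runtime import needed."""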
@@ -53,8 +45,6 @@ class CacheConfig:
     not matter if you have another vLLM instance running on the same GPU. For
     example, if you have two vLLM instances running on the same GPU, you can
     set the GPU memory utilization to 0.5 for each instance."""
-    swap_space: float = Field(default=4, ge=0)
-    """Size of the CPU swap space per GPU (in GiB)."""
     cache_dtype: CacheDType = "auto"
     """Data type for kv cache storage. If "auto", will use model data type.
     CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
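
For reference, the removed swap_space field relied on pydantic's ge constraint to reject negative sizes at validation time. A minimal standalone sketch of that behavior, assuming pydantic v2 and a hypothetical Demo model:

    from pydantic import BaseModel, Field, ValidationError

    class Demo(BaseModel):
        # Same constraint as the removed field: default 4 GiB, must be >= 0.
        swap_space: float = Field(default=4, ge=0)

    print(Demo().swap_space)               # 4.0 (the old default)
    print(Demo(swap_space=16).swap_space)  # 16.0
    try:
        Demo(swap_space=-1)
    except ValidationError as err:
        print(err)  # ge=0 rejects negative swap sizes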
@@ -173,7 +163,6 @@ class CacheConfig:
         ignored_factors = {
             # Runtime/derived knobs that don't affect compiled graph shape
             "gpu_memory_utilization",
-            "swap_space",
             "is_attention_free",
             "num_gpu_blocks_override",
             "enable_prefix_caching",
@@ -208,24 +197,3 @@ class CacheConfig:
                 "scaling factor."
             )
         return cache_dtype
-
-    def verify_with_parallel_config(
-        self,
-        parallel_config: ParallelConfig,
-    ) -> None:
-        swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
-        total_cpu_memory = get_cpu_memory()
-        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
-        # group are in the same node. However, the GPUs may span multiple nodes.
-        num_gpus_per_node = parallel_config.tensor_parallel_size
-        cpu_memory_usage = swap_space_bytes * num_gpus_per_node
-        msg = (
-            f"{format_gib(cpu_memory_usage)} GiB out of the "
-            f"{format_gib(total_cpu_memory)} GiB total CPU memory "
-            "is allocated for the swap space."
-        )
-        if cpu_memory_usage > 0.7 * total_cpu_memory:
-            raise ValueError("Too large swap space. " + msg)
-        elif cpu_memory_usage > 0.4 * total_cpu_memory:
-            logger.warning("Possibly too large swap space. %s", msg)
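
For context on the deleted check: it scaled the per-GPU swap size by the tensor-parallel degree and compared the total against 70% (error) and 40% (warning) of host RAM. A worked example with hypothetical numbers:

    import math

    GiB_bytes = 1 << 30                # as in vllm.utils.mem_constants
    swap_space = 4.0                   # GiB per GPU, the old default
    tensor_parallel_size = 8           # assumed TP degree
    total_cpu_memory = 40 * GiB_bytes  # assumed host RAM: 40 GiB

    swap_space_bytes = math.ceil(swap_space * GiB_bytes)
    cpu_memory_usage = swap_space_bytes * tensor_parallel_size  # 32 GiB

    # 32 GiB > 0.7 * 40 GiB = 28 GiB, so the old code raised ValueError;
    # between 40% and 70% it only logged a warning.
    assert cpu_memory_usage > 0.7 * total_cpu_memory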


@@ -674,8 +674,6 @@ class VllmConfig:
         self.parallel_config.is_moe_model = self.model_config.is_moe
-        self.cache_config.verify_with_parallel_config(self.parallel_config)
-
         if self.lora_config is not None:
             self.lora_config.verify_with_model_config(self.model_config)