[V0 Deprecation] Remove unused swap_space parameter (#36216)

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: mcelrath
commit 00b814ba5a (parent ee8a29511f)
Author: lif
Date: 2026-03-07 22:09:55 +08:00
Committed by: GitHub

22 changed files with 19 additions and 79 deletions


@@ -1,21 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Literal
 from pydantic import Field, SkipValidation, field_validator
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils.mem_constants import GiB_bytes
-from vllm.utils.mem_utils import format_gib, get_cpu_memory
-
-if TYPE_CHECKING:
-    from vllm.config.parallel import ParallelConfig
-else:
-    ParallelConfig = Any
 logger = init_logger(__name__)
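
The deleted import block used the standard typing.TYPE_CHECKING idiom: ParallelConfig is imported only during static type checking, which avoids a circular import at runtime, while the else branch keeps the name resolvable for annotations. A minimal sketch of the pattern, with a hypothetical module in place of vllm.config.parallel:

    from typing import TYPE_CHECKING, Any

    if TYPE_CHECKING:
        # Only the type checker executes this import, so no import cycle
        # can occur when the module is loaded at runtime.
        from mypkg.parallel import ParallelConfig  # hypothetical module
    else:
        # At runtime the annotation below still resolves, just to Any.
        ParallelConfig = Any

    def verify(parallel_config: "ParallelConfig") -> None:
        """Precise hints for type checkers, no runtime import needed."""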
@@ -53,8 +45,6 @@ class CacheConfig:
     not matter if you have another vLLM instance running on the same GPU. For
     example, if you have two vLLM instances running on the same GPU, you can
     set the GPU memory utilization to 0.5 for each instance."""
-    swap_space: float = Field(default=4, ge=0)
-    """Size of the CPU swap space per GPU (in GiB)."""
     cache_dtype: CacheDType = "auto"
     """Data type for kv cache storage. If "auto", will use model data type.
     CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
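
For reference, the removed swap_space field relied on pydantic's ge constraint to reject negative sizes at validation time. A minimal standalone sketch of that behavior, assuming pydantic v2 and a hypothetical Demo model:

    from pydantic import BaseModel, Field, ValidationError

    class Demo(BaseModel):
        # Same constraint as the removed field: default 4 GiB, must be >= 0.
        swap_space: float = Field(default=4, ge=0)

    print(Demo().swap_space)               # 4.0 (the old default)
    print(Demo(swap_space=16).swap_space)  # 16.0
    try:
        Demo(swap_space=-1)
    except ValidationError as err:
        print(err)  # ge=0 rejects negative swap sizes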
@@ -173,7 +163,6 @@ class CacheConfig:
         ignored_factors = {
             # Runtime/derived knobs that don't affect compiled graph shape
             "gpu_memory_utilization",
-            "swap_space",
             "is_attention_free",
             "num_gpu_blocks_override",
             "enable_prefix_caching",
@@ -208,24 +197,3 @@ class CacheConfig:
                 "scaling factor."
             )
         return cache_dtype
-
-    def verify_with_parallel_config(
-        self,
-        parallel_config: ParallelConfig,
-    ) -> None:
-        swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
-        total_cpu_memory = get_cpu_memory()
-        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
-        # group are in the same node. However, the GPUs may span multiple nodes.
-        num_gpus_per_node = parallel_config.tensor_parallel_size
-        cpu_memory_usage = swap_space_bytes * num_gpus_per_node
-        msg = (
-            f"{format_gib(cpu_memory_usage)} GiB out of the "
-            f"{format_gib(total_cpu_memory)} GiB total CPU memory "
-            "is allocated for the swap space."
-        )
-        if cpu_memory_usage > 0.7 * total_cpu_memory:
-            raise ValueError("Too large swap space. " + msg)
-        elif cpu_memory_usage > 0.4 * total_cpu_memory:
-            logger.warning("Possibly too large swap space. %s", msg)
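
For context on the deleted check: it scaled the per-GPU swap size by the tensor-parallel degree and compared the total against 70% (error) and 40% (warning) of host RAM. A worked example with hypothetical numbers:

    import math

    GiB_bytes = 1 << 30                # as in vllm.utils.mem_constants
    swap_space = 4.0                   # GiB per GPU, the old default
    tensor_parallel_size = 8           # assumed TP degree
    total_cpu_memory = 40 * GiB_bytes  # assumed host RAM: 40 GiB

    swap_space_bytes = math.ceil(swap_space * GiB_bytes)
    cpu_memory_usage = swap_space_bytes * tensor_parallel_size  # 32 GiB

    # 32 GiB > 0.7 * 40 GiB = 28 GiB, so the old code raised ValueError;
    # between 40% and 70% it only logged a warning.
    assert cpu_memory_usage > 0.7 * total_cpu_memory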


@@ -674,8 +674,6 @@ class VllmConfig:
         self.parallel_config.is_moe_model = self.model_config.is_moe
-        self.cache_config.verify_with_parallel_config(self.parallel_config)
-
         if self.lora_config is not None:
             self.lora_config.verify_with_model_config(self.model_config)