[V0 Deprecation] Remove unused swap_space parameter (#36216)

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: mcelrath
This commit is contained in:
lif
2026-03-07 22:09:55 +08:00
committed by GitHub
parent ee8a29511f
commit 00b814ba5a
22 changed files with 19 additions and 79 deletions

View File

@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},

View File

@@ -10,7 +10,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -37,7 +36,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -64,7 +62,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -91,7 +88,6 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,

View File

@@ -5,7 +5,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -23,7 +22,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -41,7 +39,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -59,7 +56,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,

View File

@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)

View File

@@ -141,7 +141,6 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0

View File

@@ -507,10 +507,10 @@ longer relevant in v1:
- `vllm:num_requests_swapped`
- `vllm:cpu_cache_usage_perc`
In this mode, when a request is preempted (e.g. to make room in KV
cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In this mode, when a request was preempted (e.g. to make room in KV
cache to complete other requests), KV cache blocks were swapped out to
CPU memory. The `--swap-space` flag has been removed, as this feature
is no longer used in V1.
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which

View File

@@ -17,7 +17,7 @@ llm = Vllm(
model="microsoft/Orca-2-7b",
tensor_parallel_size=4,
max_new_tokens=100,
vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
vllm_kwargs={"gpu_memory_utilization": 0.5},
)
```

View File

@@ -794,7 +794,6 @@ class VllmRunner:
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: int | None = None,
@@ -831,7 +830,6 @@ class VllmRunner:
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,

View File

@@ -22,7 +22,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="facebook/opt-125m",
@@ -30,7 +30,6 @@ llm = LLM(
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

View File

@@ -28,7 +28,7 @@ if dp_size > 1:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
@@ -37,7 +37,6 @@ llm = LLM(
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)

View File

@@ -64,7 +64,6 @@ def test_worker_apply_lora(qwen3_lora_files):
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,
swap_space=0,
cache_dtype="auto",
),
lora_config=LoRAConfig(

View File

@@ -182,7 +182,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
cache_dtype="auto",
swap_space=0,
)
# Set cache blocks for testing
# (these may be set during initialization normally)

View File

@@ -1776,7 +1776,6 @@ def create_scheduler_with_priority(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)
@@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)

View File

@@ -94,7 +94,6 @@ def create_scheduler(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)

View File

@@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache(
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=enable_prefix_caching,
)

View File

@@ -206,7 +206,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=True,
)

View File

@@ -118,7 +118,6 @@ def create_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype=cache_dtype,
enable_prefix_caching=True,
)

View File

@@ -96,7 +96,6 @@ def get_vllm_config():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
@@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()
@@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache():
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
)
parallel_config = ParallelConfig()

View File

@@ -1,21 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
from typing import Literal
from pydantic import Field, SkipValidation, field_validator
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib, get_cpu_memory
if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
else:
ParallelConfig = Any
logger = init_logger(__name__)
@@ -53,8 +45,6 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
swap_space: float = Field(default=4, ge=0)
"""Size of the CPU swap space per GPU (in GiB)."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
@@ -173,7 +163,6 @@ class CacheConfig:
ignored_factors = {
# Runtime/derived knobs that don't affect compiled graph shape
"gpu_memory_utilization",
"swap_space",
"is_attention_free",
"num_gpu_blocks_override",
"enable_prefix_caching",
@@ -208,24 +197,3 @@ class CacheConfig:
"scaling factor."
)
return cache_dtype
def verify_with_parallel_config(
    self,
    parallel_config: ParallelConfig,
) -> None:
    """Check the configured CPU swap space against total CPU memory.

    The per-GPU swap space (``self.swap_space``, in GiB) is converted to
    bytes and multiplied by ``parallel_config.tensor_parallel_size`` to
    estimate the CPU memory reserved for swapping.

    Args:
        parallel_config: Parallel configuration; only its
            ``tensor_parallel_size`` is read.

    Raises:
        ValueError: If the estimated usage exceeds 70% of the total CPU
            memory. Usage above 40% only logs a warning.
    """
    bytes_per_gpu = math.ceil(self.swap_space * GiB_bytes)
    host_memory = get_cpu_memory()

    # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
    # group are in the same node. However, the GPUs may span multiple nodes.
    gpus_on_node = parallel_config.tensor_parallel_size
    requested = bytes_per_gpu * gpus_on_node

    # The message is built up front because both the error and the
    # warning path report the same figures.
    msg = (
        f"{format_gib(requested)} GiB out of the "
        f"{format_gib(host_memory)} GiB total CPU memory "
        "is allocated for the swap space."
    )

    # Hard limit: refuse configurations above 70% of CPU memory.
    if requested > 0.7 * host_memory:
        raise ValueError("Too large swap space. " + msg)

    # Soft limit: above 40% is allowed but reported.
    if requested > 0.4 * host_memory:
        logger.warning("Possibly too large swap space. %s", msg)

View File

@@ -674,8 +674,6 @@ class VllmConfig:
self.parallel_config.is_moe_model = self.model_config.is_moe
self.cache_config.verify_with_parallel_config(self.parallel_config)
if self.lora_config is not None:
self.lora_config.verify_with_model_config(self.model_config)

View File

@@ -447,7 +447,6 @@ class EngineArgs:
)
disable_sliding_window: bool = ModelConfig.disable_sliding_window
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
swap_space: float = CacheConfig.swap_space
offload_backend: str = OffloadConfig.offload_backend
cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb
cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params")
@@ -961,7 +960,6 @@ class EngineArgs:
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
cache_group.add_argument(
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
@@ -1526,7 +1524,6 @@ class EngineArgs:
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override,

View File

@@ -164,12 +164,6 @@ class LLM:
compared with using gpu_memory_utilization. Note that
kv_cache_memory_bytes (when not-None) ignores
gpu_memory_utilization
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Note that `best_of` is only supported in V0. Otherwise, too small
values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
@@ -240,7 +234,6 @@ class LLM:
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
@@ -265,6 +258,17 @@ class LLM:
) -> None:
"""LLM constructor."""
if "swap_space" in kwargs:
kwargs.pop("swap_space")
import warnings
warnings.warn(
"The 'swap_space' parameter is deprecated and ignored. "
"It will be removed in a future version.",
DeprecationWarning,
stacklevel=2,
)
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
@@ -353,7 +357,6 @@ class LLM:
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
kv_cache_memory_bytes=kv_cache_memory_bytes,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
offload_num_in_group=offload_num_in_group,