[V0 Deprecation] Remove unused swap_space parameter (#36216)
Signed-off-by: majiayu000 <1835304752@qq.com> Co-authored-by: mcelrath
This commit is contained in:
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -37,7 +36,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -64,7 +62,6 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -91,7 +88,6 @@
|
||||
"server_parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -23,7 +22,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -41,7 +39,6 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -59,7 +56,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
|
||||
@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
@@ -141,7 +141,6 @@ def _create_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
swap_space=0,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
|
||||
@@ -507,10 +507,10 @@ longer relevant in v1:
|
||||
- `vllm:num_requests_swapped`
|
||||
- `vllm:cpu_cache_usage_perc`
|
||||
|
||||
In this mode, when a request is preempted (e.g. to make room in KV
|
||||
cache to complete other requests), we swap KV cache blocks out to CPU
|
||||
memory. This is also known as "KV cache offloading" and is configured
|
||||
with `--swap-space` and `--preemption-mode`.
|
||||
In this mode, when a request was preempted (e.g. to make room in KV
|
||||
cache to complete other requests), KV cache blocks were swapped out to
|
||||
CPU memory. The `--swap-space` flag has been removed as this feature
|
||||
is no longer used in V1.
|
||||
|
||||
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||
SequenceGroup encapsulated the idea of N Sequences which
|
||||
|
||||
@@ -17,7 +17,7 @@ llm = Vllm(
|
||||
model="microsoft/Orca-2-7b",
|
||||
tensor_parallel_size=4,
|
||||
max_new_tokens=100,
|
||||
vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
|
||||
vllm_kwargs={"gpu_memory_utilization": 0.5},
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@@ -794,7 +794,6 @@ class VllmRunner:
|
||||
tensor_parallel_size: int = 1,
|
||||
block_size: int = 16 if not torch.xpu.is_available() else 64,
|
||||
enable_chunked_prefill: bool | None = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: bool | None = False,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: int | None = None,
|
||||
@@ -831,7 +830,6 @@ class VllmRunner:
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=seed,
|
||||
swap_space=swap_space,
|
||||
enforce_eager=enforce_eager,
|
||||
disable_log_stats=disable_log_stats,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
|
||||
@@ -22,7 +22,7 @@ prompts = [
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
|
||||
# set different `gpu_memory_utilization` for different ranks,
|
||||
# to test if all ranks agree on the same kv cache configuration.
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
@@ -30,7 +30,6 @@ llm = LLM(
|
||||
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0,
|
||||
)
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ if dp_size > 1:
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
|
||||
# set different `gpu_memory_utilization` for different ranks,
|
||||
# to test if all ranks agree on the same kv cache configuration.
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-mini-MoE-instruct",
|
||||
@@ -37,7 +37,6 @@ llm = LLM(
|
||||
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0,
|
||||
)
|
||||
|
||||
|
||||
@@ -64,7 +64,6 @@ def test_worker_apply_lora(qwen3_lora_files):
|
||||
device_config=DeviceConfig("cuda"),
|
||||
cache_config=CacheConfig(
|
||||
block_size=16,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
),
|
||||
lora_config=LoRAConfig(
|
||||
|
||||
@@ -182,7 +182,6 @@ def create_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
cache_dtype="auto",
|
||||
swap_space=0,
|
||||
)
|
||||
# Set cache blocks for testing
|
||||
# (these may be set during initialization normally)
|
||||
|
||||
@@ -1776,7 +1776,6 @@ def create_scheduler_with_priority(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
)
|
||||
@@ -3726,7 +3725,6 @@ def _create_encoder_decoder_scheduler(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
@@ -94,7 +94,6 @@ def create_scheduler(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
)
|
||||
|
||||
@@ -506,7 +506,6 @@ def test_encoder_instance_zero_kv_cache(
|
||||
cache_config = CacheConfig(
|
||||
block_size=16,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
)
|
||||
|
||||
@@ -206,7 +206,6 @@ def create_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=True,
|
||||
)
|
||||
|
||||
@@ -118,7 +118,6 @@ def create_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype=cache_dtype,
|
||||
enable_prefix_caching=True,
|
||||
)
|
||||
|
||||
@@ -96,7 +96,6 @@ def get_vllm_config():
|
||||
cache_config = CacheConfig(
|
||||
block_size=BLOCK_SIZE,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
)
|
||||
parallel_config = ParallelConfig()
|
||||
@@ -809,7 +808,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
|
||||
cache_config = CacheConfig(
|
||||
block_size=BLOCK_SIZE,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
)
|
||||
parallel_config = ParallelConfig()
|
||||
@@ -1242,7 +1240,6 @@ def test_cudagraph_sizes_capped_for_mamba_cache():
|
||||
cache_config = CacheConfig(
|
||||
block_size=BLOCK_SIZE,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
)
|
||||
parallel_config = ParallelConfig()
|
||||
|
||||
@@ -1,21 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field, SkipValidation, field_validator
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import format_gib, get_cpu_memory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
else:
|
||||
ParallelConfig = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -53,8 +45,6 @@ class CacheConfig:
|
||||
not matter if you have another vLLM instance running on the same GPU. For
|
||||
example, if you have two vLLM instances running on the same GPU, you can
|
||||
set the GPU memory utilization to 0.5 for each instance."""
|
||||
swap_space: float = Field(default=4, ge=0)
|
||||
"""Size of the CPU swap space per GPU (in GiB)."""
|
||||
cache_dtype: CacheDType = "auto"
|
||||
"""Data type for kv cache storage. If "auto", will use model data type.
|
||||
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
|
||||
@@ -173,7 +163,6 @@ class CacheConfig:
|
||||
ignored_factors = {
|
||||
# Runtime/derived knobs that don't affect compiled graph shape
|
||||
"gpu_memory_utilization",
|
||||
"swap_space",
|
||||
"is_attention_free",
|
||||
"num_gpu_blocks_override",
|
||||
"enable_prefix_caching",
|
||||
@@ -208,24 +197,3 @@ class CacheConfig:
|
||||
"scaling factor."
|
||||
)
|
||||
return cache_dtype
|
||||
|
||||
def verify_with_parallel_config(
|
||||
self,
|
||||
parallel_config: ParallelConfig,
|
||||
) -> None:
|
||||
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
|
||||
total_cpu_memory = get_cpu_memory()
|
||||
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
|
||||
# group are in the same node. However, the GPUs may span multiple nodes.
|
||||
num_gpus_per_node = parallel_config.tensor_parallel_size
|
||||
cpu_memory_usage = swap_space_bytes * num_gpus_per_node
|
||||
|
||||
msg = (
|
||||
f"{format_gib(cpu_memory_usage)} GiB out of the "
|
||||
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
|
||||
"is allocated for the swap space."
|
||||
)
|
||||
if cpu_memory_usage > 0.7 * total_cpu_memory:
|
||||
raise ValueError("Too large swap space. " + msg)
|
||||
elif cpu_memory_usage > 0.4 * total_cpu_memory:
|
||||
logger.warning("Possibly too large swap space. %s", msg)
|
||||
|
||||
@@ -674,8 +674,6 @@ class VllmConfig:
|
||||
|
||||
self.parallel_config.is_moe_model = self.model_config.is_moe
|
||||
|
||||
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
||||
|
||||
if self.lora_config is not None:
|
||||
self.lora_config.verify_with_model_config(self.model_config)
|
||||
|
||||
|
||||
@@ -447,7 +447,6 @@ class EngineArgs:
|
||||
)
|
||||
disable_sliding_window: bool = ModelConfig.disable_sliding_window
|
||||
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
|
||||
swap_space: float = CacheConfig.swap_space
|
||||
offload_backend: str = OffloadConfig.offload_backend
|
||||
cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb
|
||||
cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params")
|
||||
@@ -961,7 +960,6 @@ class EngineArgs:
|
||||
cache_group.add_argument(
|
||||
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
|
||||
)
|
||||
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
|
||||
cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
|
||||
cache_group.add_argument(
|
||||
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
|
||||
@@ -1526,7 +1524,6 @@ class EngineArgs:
|
||||
block_size=self.block_size,
|
||||
gpu_memory_utilization=self.gpu_memory_utilization,
|
||||
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
|
||||
swap_space=self.swap_space,
|
||||
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
|
||||
is_attention_free=model_config.is_attention_free,
|
||||
num_gpu_blocks_override=self.num_gpu_blocks_override,
|
||||
|
||||
@@ -164,12 +164,6 @@ class LLM:
|
||||
compared with using gpu_memory_utilization. Note that
|
||||
kv_cache_memory_bytes (when not-None) ignores
|
||||
gpu_memory_utilization
|
||||
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
|
||||
This can be used for temporarily storing the states of the requests
|
||||
when their `best_of` sampling parameters are larger than 1. If all
|
||||
requests will have `best_of=1`, you can safely set this to 0.
|
||||
Noting that `best_of` is only supported in V0. Otherwise, too small
|
||||
values may cause out-of-memory (OOM) errors.
|
||||
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
|
||||
the model weights. This virtually increases the GPU memory space
|
||||
you can use to hold the model weights, at the cost of CPU-GPU data
|
||||
@@ -240,7 +234,6 @@ class LLM:
|
||||
chat_template: Path | str | None = None,
|
||||
seed: int = 0,
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
swap_space: float = 4,
|
||||
cpu_offload_gb: float = 0,
|
||||
offload_group_size: int = 0,
|
||||
offload_num_in_group: int = 1,
|
||||
@@ -265,6 +258,17 @@ class LLM:
|
||||
) -> None:
|
||||
"""LLM constructor."""
|
||||
|
||||
if "swap_space" in kwargs:
|
||||
kwargs.pop("swap_space")
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"The 'swap_space' parameter is deprecated and ignored. "
|
||||
"It will be removed in a future version.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if "disable_log_stats" not in kwargs:
|
||||
kwargs["disable_log_stats"] = True
|
||||
|
||||
@@ -353,7 +357,6 @@ class LLM:
|
||||
seed=seed,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
kv_cache_memory_bytes=kv_cache_memory_bytes,
|
||||
swap_space=swap_space,
|
||||
cpu_offload_gb=cpu_offload_gb,
|
||||
offload_group_size=offload_group_size,
|
||||
offload_num_in_group=offload_num_in_group,
|
||||
|
||||
Reference in New Issue
Block a user