diff --git a/tests/v1/kv_connector/unit/test_config.py b/tests/v1/kv_connector/unit/test_config.py
index 74075f3ee..8a547c3f0 100644
--- a/tests/v1/kv_connector/unit/test_config.py
+++ b/tests/v1/kv_connector/unit/test_config.py
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
         ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
         # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
         ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
-        (None, None, 1, 1, None, None),
+        # When kv_offloading_size is None, offloading is disabled (backend is ignored)
+        ("native", None, 1, 1, None, None),
     ],
 )
 def test_kv_connector(
@@ -62,3 +63,19 @@ def test_kv_connector(
     assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
     # Existing config should be replaced
     assert "existing_key" not in kv_connector_extra_config
+
+
+def test_kv_offloading_size_only_uses_native_default():
+    """Test that setting only kv_offloading_size enables native offloading."""
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_offloading_size=4.0,
+            # kv_offloading_backend not set, should default to "native"
+        ),
+    )
+
+    kv_transfer_config = vllm_config.kv_transfer_config
+    kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
+    assert kv_transfer_config.kv_connector == "OffloadingConnector"
+    assert kv_transfer_config.kv_role == "kv_both"
+    assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 318efc82a..839ea4780 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -152,13 +152,13 @@ class CacheConfig:
     kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
-    to None, which means no KV offloading is enabled. When set with
-    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
+    to None, which means no KV offloading is enabled. When set, vLLM will
+    enable KV cache offloading to CPU using the kv_offloading_backend."""
 
-    kv_offloading_backend: KVOffloadingBackend | None = None
+    kv_offloading_backend: KVOffloadingBackend = "native"
     """The backend to use for KV cache offloading. Supported backends include
-    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
-    together with kv_offloading_size."""
+    'native' (vLLM native CPU offloading), 'lmcache'.
+    KV offloading is only activated when kv_offloading_size is set."""
 
     def compute_hash(self) -> str:
         """
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index a84acd8e6..ec699b629 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -498,17 +498,15 @@ class VllmConfig:
         Right now, this function reads the offloading settings from CacheConfig
         and configures the KVTransferConfig accordingly.
         """
 
-        if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
+        # KV offloading is only activated when kv_offloading_size is set.
+        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
             return
+        kv_offloading_backend = self.cache_config.kv_offloading_backend
+
         # If no KVTransferConfig is provided, create a default one.
         if self.kv_transfer_config is None:
             self.kv_transfer_config = KVTransferConfig()
-
-        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
-            raise ValueError(
-                "You must set kv_offloading_size when kv_offloading_backend is set."
-            )
         num_kv_ranks = (
             self.parallel_config.tensor_parallel_size
             * self.parallel_config.pipeline_parallel_size
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b7f3969ee..3a36ca797 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -578,9 +578,7 @@ class EngineArgs:
     optimization_level: OptimizationLevel = VllmConfig.optimization_level
 
     kv_offloading_size: float | None = CacheConfig.kv_offloading_size
-    kv_offloading_backend: KVOffloadingBackend | None = (
-        CacheConfig.kv_offloading_backend
-    )
+    kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
     tokens_only: bool = False
 
     def __post_init__(self):